Final Predictions¶

No 2024 data was used during model training. Similarly, no player in the test pool was used during model training. The data/test.csv dataset should therefore be a true holdout set and give an accurate estimate of model performance.

Three model architectures were evaluated and the linear model was chosen as the winner.

In [1]:
import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error

from bullpen import cv_utils, data_utils, model_utils, plot_utils

Load Data¶

In [2]:
# Read test.csv from the project's data directory and preview the first rows.
test_df = pd.read_csv(data_utils.DATA_DIR.joinpath("test.csv"))
test_df.head()
Out[2]:
PlayerId Team Season MLBAMID Name Age TBF K% Rk IP ... 02s 02h L/SO S/SO L/SO% 3pK 4pW PAu Pitu Stru
0 11490 SFG 2021 592717 Aaron Sanchez 28 156 0.166667 909 35.1 ... 15 2 8 18 0.308 2 5 0 0 0
1 11490 - - - 2022 592717 Aaron Sanchez 29 265 0.154717 847 60.0 ... 41 8 12 29 0.293 6 5 0 0 0
2 20132 OAK 2022 670124 Adam Oller 27 337 0.136499 712 74.1 ... 42 5 9 37 0.196 3 6 0 0 0
3 20132 MIA 2024 670124 Adam Oller 29 189 0.190476 720 42.1 ... 14 2 9 27 0.250 5 4 0 0 0
4 2233 STL 2021 425794 Adam Wainwright 39 828 0.210145 1079 206.1 ... 96 5 67 105 0.385 30 4 0 0 0

5 rows × 39 columns

In [3]:
# Predictions can be made on the entire dataset, but 2024 is the year we care about.
# Keep only the rows whose Season is 2024 and rebuild the index so it starts at 0.
test_2024_df = test_df[test_df.Season == 2024].reset_index(drop=True)
test_2024_df
Out[3]:
PlayerId Team Season MLBAMID Name Age TBF K% Rk IP ... 02s 02h L/SO S/SO L/SO% 3pK 4pW PAu Pitu Stru
0 20132 MIA 2024 670124 Adam Oller 29 189 0.190476 720 42.1 ... 14 2 9 27 0.250 5 4 0 0 0
1 12718 NYM 2024 605288 Adrian Houser 31 309 0.145631 451 69.1 ... 44 2 26 19 0.578 17 4 0 0 0
2 25007 LAD 2024 681911 Alex Vesia 28 263 0.330798 1006 66.1 ... 46 4 18 69 0.207 19 5 0 0 0
3 26108 STL 2024 669467 Andre Pallante 25 509 0.184676 731 121.1 ... 51 3 16 78 0.170 12 9 0 0 0
4 15423 TEX 2024 571760 Andrew Heaney 33 693 0.229437 399 160.0 ... 88 8 41 118 0.258 29 5 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
134 25595 - - - 2024 675540 Xzavion Curry 25 172 0.162791 228 42.2 ... 27 4 7 21 0.250 6 0 0 0 0
135 33838 TOR 2024 684320 Yariel Rodríguez 27 368 0.230978 829 86.2 ... 56 2 18 67 0.212 13 5 0 0 0
136 15823 TBR 2024 641793 Zack Littell 28 656 0.214939 570 156.1 ... 111 11 38 103 0.270 24 5 0 0 0
137 10310 PHI 2024 554430 Zack Wheeler 34 787 0.284625 1043 200.0 ... 132 11 71 153 0.317 39 10 0 0 0
138 31827 MIN 2024 805673 Zebby Matthews 24 177 0.242938 627 37.2 ... 21 5 9 34 0.209 5 1 0 0 0

139 rows × 39 columns

In [4]:
# Split each frame into model inputs (X) and target (y) with the project helper:
# once for every season in the test set and once for the 2024-only subset.
# NOTE(review): which columns become features/target is decided inside
# cv_utils.pred_X_y — presumably K% is the target; confirm there.
X_df, y_df = cv_utils.pred_X_y(test_df)
X_2024_df, y_2024_df = cv_utils.pred_X_y(test_2024_df)

Load Models¶

In [5]:
# Load the three persisted models from the project's model directory.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts that were produced by this project.

# Only lasso features
lr_model = joblib.load(model_utils.MODEL_DIR.joinpath("linear.joblib"))

# All features
rf_model = joblib.load(model_utils.MODEL_DIR.joinpath("randomforest.joblib"))
xgb_model = joblib.load(model_utils.MODEL_DIR.joinpath("xgboost.joblib"))

Evaluate Models¶

In [6]:
def summarize(model, X, y):
    """Print a one-line evaluation of ``model`` on ``(X, y)`` and return its predictions.

    The printed line contains the class name of the best estimator held by the
    pipeline's ``"regressor"`` step, the value of ``model.score(X, y)`` and the
    mean squared error of the predictions.

    Args:
        model: Pipeline-like object exposing ``named_steps["regressor"]`` (which
            must have a ``best_estimator_`` attribute), ``predict`` and ``score``.
        X: Feature data handed to ``predict`` and ``score``.
        y: Target values the predictions are compared against.

    Returns:
        Whatever ``model.predict(X)`` returns.
    """
    regressor_step = model.named_steps["regressor"]
    label = regressor_step.best_estimator_.__class__.__name__
    predictions = model.predict(X)
    error = mean_squared_error(y, predictions)
    fit_quality = model.score(X, y)
    # Spelled out instead of the ``{name=}`` shorthand so the printed text does
    # not depend on the local variable names.
    print(f"{label} score={fit_quality:.3f} mse={error:.5f}")
    return predictions
In [7]:
# 2024 data: evaluate the three models in order (linear, random forest, XGBoost),
# printing one summary line per model and keeping each model's predictions.
lr_2024_preds, rf_2024_preds, xgb_2024_preds = [
    summarize(fitted_model, X_2024_df, y_2024_df)
    for fitted_model in (lr_model, rf_model, xgb_model)
]
LinearRegression score=0.944 mse=0.00018
RandomForestRegressor score=0.926 mse=0.00024
XGBRegressor score=0.935 mse=0.00021

Conclusion¶

The linear model is the winner. Let's take a look at its results in detail.

In [8]:
# Summarize the linear model on X_df/y_df (built from the full test_df rather than
# the 2024 subset); these predictions feed the per-player plots further down.
lr_preds = summarize(lr_model, X_df, y_df)
LinearRegression score=0.944 mse=0.00018
In [9]:
# Pair each input feature name of the linear pipeline with the coefficient of its
# best estimator; ordering is delegated to model_utils.sort_features_by_coefs
# (the recorded output is ordered by decreasing absolute coefficient).
lr_feature_impr = model_utils.sort_features_by_coefs(
    feature_names=lr_model.feature_names_in_,
    coefs=lr_model.named_steps["regressor"].best_estimator_.coef_,
)
lr_feature_impr
Out[9]:
[('I/Str', -0.052868819639745955),
 ('Pit/PA', -0.014323270309228354),
 ('Con', -0.012448771717174015),
 ('30%', -0.004762331133752689),
 ('L/SO', 0.004409235609710712),
 ('F/Str', -0.0016998836219298695),
 ('Str%', -0.0003509693098396682)]
In [10]:
# Plot the linear model's 2024 predictions against the 2024 targets.
# savepath is presumably where the helper writes the PNG — confirm in plot_utils.
plot_utils.plot_pred_vs_target(
    X_2024_df,
    y_2024_df,
    lr_2024_preds,
    "LinearRegression (w/ Lasso Features)",
    savepath="../assets/images/linear-pred-vs-target.png",
)
No description has been provided for this image
In [11]:
# Same 2024 prediction-vs-target plot, requested in "interactive" mode with an
# .html savepath (presumably a standalone interactive page — confirm in plot_utils).
plot_utils.plot_pred_vs_target(
    X_2024_df,
    y_2024_df,
    lr_2024_preds,
    "LinearRegression (with Lasso Features) Interactive",
    mode="interactive",
    savepath="../assets/images/linear-pred-vs-target.html",
)
loading player ids from /Users/logan/Desktop/repos/mlb-pitcher-xK/data/player_ids.json...
In [12]:
# Look up the 2024 player at the "max" extreme of the prediction/target delta and
# show their name plus MLB and FanGraphs ids.
# NOTE(review): presumably this is the player the model misses by the most —
# confirm how the delta is defined in model_utils.find_delta_extrema.
name, mlb_id, fangraphs_id = model_utils.find_delta_extrema(
    X_2024_df,
    y_2024_df,
    lr_2024_preds,
    extrema="max",
)
name, mlb_id, fangraphs_id
loading player ids from /Users/logan/Desktop/repos/mlb-pitcher-xK/data/player_ids.json...
Out[12]:
('Jake Woodford', 663765, 18674)
In [13]:
# Plot Jake Woodford (the player returned by the extrema="max" lookup) using the
# full test data and the full-dataset linear predictions.
plot_utils.plot_player(
    "Jake Woodford",
    X_df,
    y_df,
    lr_preds,
)
No description has been provided for this image
xK%: 0.1153
K% : [0.17064846, 0.12765957, 0.13063063, 0.15568862]
In [14]:
# Look up the 2024 player at the "min" extreme of the prediction/target delta and
# show their name plus MLB and FanGraphs ids.
# NOTE(review): presumably this is the player the model predicts most closely —
# confirm how the delta is defined in model_utils.find_delta_extrema.
name, mlb_id, fangraphs_id = model_utils.find_delta_extrema(
    X_2024_df,
    y_2024_df,
    lr_2024_preds,
    extrema="min",
)
name, mlb_id, fangraphs_id
Out[14]:
('José Ruiz', 614179, 14552)
In [15]:
# Plot José Ruiz (the player returned by the extrema="min" lookup) using the full
# test data and the full-dataset linear predictions.
# The name is written with a real "é": the previous literal was mis-encoded, and
# plot_player presumably selects the player by this exact string.
plot_utils.plot_player(
    "José Ruiz",
    X_df,
    y_df,
    lr_preds,
)
No description has been provided for this image
xK%: 0.2396
K% : [0.23247232, 0.25660377, 0.1884058, 0.23963134]
In [16]:
# Plot Adam Wainwright with target_year=2023 and pass a savepath for the figure.
# NOTE(review): presumably 2023 is used because he has no 2024 row in the test
# data (the recorded output lists only three K% values) — confirm.
plot_utils.plot_player(
    "Adam Wainwright",
    X_df,
    y_df,
    lr_preds,
    target_year=2023,
    savepath="../assets/images/wainwright-pred.png",
)
No description has been provided for this image
xK%: 0.0938
K% : [0.21014493, 0.17808219, 0.11363636]
In [17]:
# Plot Sonny Gray from the full test data and linear predictions, passing a
# savepath for the figure.
plot_utils.plot_player(
    "Sonny Gray", X_df, y_df, lr_preds, savepath="../assets/images/gray-pred.png"
)
No description has been provided for this image
xK%: 0.2903
K% : [0.26956522, 0.2397541, 0.24270557, 0.30253353]
In [18]:
# Plot Joe Musgrove from the full test data and linear predictions, passing a
# savepath for the figure.
plot_utils.plot_player(
    "Joe Musgrove", X_df, y_df, lr_preds, savepath="../assets/images/musgrove-pred.png"
)
No description has been provided for this image
xK%: 0.2568
K% : [0.27139037, 0.24864865, 0.24310777, 0.24634146]